# common imports
import pandas as pd
import numpy as np
# import data - use dtype argument due to large file size
nyc_full = pd.read_csv('NYC_Payroll_Data.csv', dtype={'Fiscal Year': 'int', 'Payroll Number': 'float', 'Agency Name': 'str',
'Last Name': 'str','First Name': 'str','Mid Init': 'str',
'Agency Start Date': 'str','Work Location Borough': 'str','Title Description': 'str',
'Leave Status as of June 30': 'str','Base Salary': 'float','Pay Basis': 'str',
'Regular Hours': 'float', 'Regular Gross Paid': 'float', 'OT Hours': 'float',
'Total OT Paid': 'float', 'Total Other Pay': 'float'})
# make copy of data so you do not need to import again if you make a mistake and need to "reset"
nyc = nyc_full.copy()
# check information about DataFrame
nyc.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 4496767 entries, 0 to 4496766 Data columns (total 17 columns): # Column Dtype --- ------ ----- 0 Fiscal Year int32 1 Payroll Number float64 2 Agency Name object 3 Last Name object 4 First Name object 5 Mid Init object 6 Agency Start Date object 7 Work Location Borough object 8 Title Description object 9 Leave Status as of June 30 object 10 Base Salary float64 11 Pay Basis object 12 Regular Hours float64 13 Regular Gross Paid float64 14 OT Hours float64 15 Total OT Paid float64 16 Total Other Pay float64 dtypes: float64(7), int32(1), object(9) memory usage: 566.1+ MB
# check first five rows of data
nyc.head()
Fiscal Year | Payroll Number | Agency Name | Last Name | First Name | Mid Init | Agency Start Date | Work Location Borough | Title Description | Leave Status as of June 30 | Base Salary | Pay Basis | Regular Hours | Regular Gross Paid | OT Hours | Total OT Paid | Total Other Pay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020 | 17.0 | OFFICE OF EMERGENCY MANAGEMENT | BEREZIN | MIKHAIL | NaN | 08/10/2015 | BROOKLYN | EMERGENCY PREPAREDNESS MANAGER | ACTIVE | 86005.0 | per Annum | 1820.0 | 84698.21 | 0.0 | 0.0 | 0.0 |
1 | 2020 | 17.0 | OFFICE OF EMERGENCY MANAGEMENT | GEAGER | VERONICA | M | 09/12/2016 | BROOKLYN | EMERGENCY PREPAREDNESS MANAGER | ACTIVE | 86005.0 | per Annum | 1820.0 | 84698.21 | 0.0 | 0.0 | 0.0 |
2 | 2020 | 17.0 | OFFICE OF EMERGENCY MANAGEMENT | RAMANI | SHRADDHA | NaN | 02/22/2016 | BROOKLYN | EMERGENCY PREPAREDNESS MANAGER | ACTIVE | 86005.0 | per Annum | 1820.0 | 84698.21 | 0.0 | 0.0 | 0.0 |
3 | 2020 | 17.0 | OFFICE OF EMERGENCY MANAGEMENT | ROTTA | JONATHAN | D | 09/16/2013 | BROOKLYN | EMERGENCY PREPAREDNESS MANAGER | ACTIVE | 86005.0 | per Annum | 1820.0 | 84698.21 | 0.0 | 0.0 | 0.0 |
4 | 2020 | 17.0 | OFFICE OF EMERGENCY MANAGEMENT | WILSON II | ROBERT | P | 04/30/2018 | BROOKLYN | EMERGENCY PREPAREDNESS MANAGER | ACTIVE | 86005.0 | per Annum | 1820.0 | 84698.21 | 0.0 | 0.0 | 0.0 |
If you were to research the values in the DataFrame further, you will notice that there are some values in lowercase and most are listed in all uppercase. Because of this, run the function below that will transform all characters to uppercase to be consistent with the majority of the data.
# run this code block
def upper_case(df):
'''
Takes as input the `nyc` DataFrame.
Then capitlizes each character in columns with "object" data type.
'''
# iterating over each column in the DataFrame
for col in df.columns:
# checking datatype of each column
if df[col].dtype == 'object':
# applying upper function on column
df[col] = df[col].str.upper()
upper_case(nyc)
Q1: How many rows are contained in the full dataset?
rows = nyc.shape[0]
print("Number of Rows = ", rows)
Number of Rows = 4496767
Q2: How many individuals are represented using only the 2021 fiscal year data?
### ENTER CODE HERE ###
nyc2 = nyc.copy()
nyc2 = nyc2[nyc2['Fiscal Year']==2021]
print("Individuals represented = ",nyc2.shape[0])
Individuals represented = 573477
Q3: Who had the highest base salary in 2021? What was his/her base salary? What Agency did he/she work for? What was his/her title?
### ENTER CODE HERE ###
highest_salary = nyc2['Base Salary'].max()
print("Highest Salary = ",highest_salary)
Highest Salary = 414707.0
Q4: List the top 5 employees with the highest regular gross pay in 2021.
### ENTER CODE HERE ###
top_5 = nyc2.sort_values(by = 'Regular Gross Paid',ascending = False)
top_5 = nyc2.head()
top_5
Fiscal Year | Payroll Number | Agency Name | Last Name | First Name | Mid Init | Agency Start Date | Work Location Borough | Title Description | Leave Status as of June 30 | Base Salary | Pay Basis | Regular Hours | Regular Gross Paid | OT Hours | Total OT Paid | Total Other Pay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3923290 | 2021 | 996.0 | NYC HOUSING AUTHORITY | MUSTACIUOLO | VITO | J | 02/26/2018 | MANHATTAN | EXECUTIVE DIRECTOR | ACTIVE | 258000.0 | PER ANNUM | 1820.0 | 257260.30 | 0.00 | 0.00 | 258000.00 |
3923291 | 2021 | 996.0 | NYC HOUSING AUTHORITY | RUSS | GREGORY | P | 08/12/2019 | MANHATTAN | CHAIR | ACTIVE | 414707.0 | PER ANNUM | 1820.0 | 413518.05 | 0.00 | 0.00 | 500.00 |
3923292 | 2021 | 816.0 | DEPT OF HEALTH/MENTAL HYGIENE | MCGROARTY | MICHAEL | NaN | 10/06/2014 | QUEENS | STATIONARY ENGINEER | ACTIVE | 508.8 | PER DAY | 2080.0 | 132288.00 | 2374.75 | 238829.13 | 40105.00 |
3923293 | 2021 | 816.0 | DEPT OF HEALTH/MENTAL HYGIENE | HALLAHAN | PATRICK | M | 02/26/2018 | BROOKLYN | STATIONARY ENGINEER | ACTIVE | 508.8 | PER DAY | 2080.0 | 132288.00 | 2115.25 | 218628.18 | 56616.07 |
3923294 | 2021 | 816.0 | DEPT OF HEALTH/MENTAL HYGIENE | PETTIT | PATRICK | J | 08/02/2010 | MANHATTAN | STATIONARY ENGINEER | ACTIVE | 508.8 | PER DAY | 2080.0 | 132288.00 | 2152.75 | 218694.96 | 38611.82 |
Q5: Which employee had the highest total OT paid in 2021?
### ENTER CODE HERE ###
highest_OT = (nyc2[nyc2['Total OT Paid'] == nyc2['Total OT Paid'].max()])
highest_OT
Fiscal Year | Payroll Number | Agency Name | Last Name | First Name | Mid Init | Agency Start Date | Work Location Borough | Title Description | Leave Status as of June 30 | Base Salary | Pay Basis | Regular Hours | Regular Gross Paid | OT Hours | Total OT Paid | Total Other Pay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3923298 | 2021 | 996.0 | NYC HOUSING AUTHORITY | PROCIDA | ROBERT | NaN | 04/13/1987 | BRONX | SUPERVISOR PLUMBER | ACTIVE | 387.03 | PER DAY | 1820.0 | 100627.8 | 2249.5 | 248749.72 | 7215.34 |
Q6: Which employee that was still active as of June 30th had the highest total other pay in 2021?
### ENTER CODE HERE ###
highest_otherpay = (nyc2[nyc2['Leave Status as of June 30'] == 'ACTIVE'])
highest_otherpay = (highest_otherpay[highest_otherpay['Total Other Pay'] == highest_otherpay['Total Other Pay'].max()])
highest_otherpay
Fiscal Year | Payroll Number | Agency Name | Last Name | First Name | Mid Init | Agency Start Date | Work Location Borough | Title Description | Leave Status as of June 30 | Base Salary | Pay Basis | Regular Hours | Regular Gross Paid | OT Hours | Total OT Paid | Total Other Pay | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3923290 | 2021 | 996.0 | NYC HOUSING AUTHORITY | MUSTACIUOLO | VITO | J | 02/26/2018 | MANHATTAN | EXECUTIVE DIRECTOR | ACTIVE | 258000.0 | PER ANNUM | 1820.0 | 257260.3 | 0.0 | 0.0 | 258000.0 |
Q7: According to the Data Dictionary, an employee's total gross pay is equal to the sum of their regular gross pay, total overtime pay, and total other pay. Create a column called Total Gross Paid
in the nyc
DataFrame that sums those three respective pay columns for each employee.
Which 10 individuals had the highest total gross pay in 2021?
### ENTER CODE HERE ###
nyc2['Total Gross Paid'] = nyc2['Regular Gross Paid']+nyc2['Total OT Paid']+nyc2['Total Other Pay']
top_10_gross = nyc2.sort_values(by = 'Regular Gross Paid',ascending = False)
top_10_gross = nyc2.head(10)
top_10_gross
Fiscal Year | Payroll Number | Agency Name | Last Name | First Name | Mid Init | Agency Start Date | Work Location Borough | Title Description | Leave Status as of June 30 | Base Salary | Pay Basis | Regular Hours | Regular Gross Paid | OT Hours | Total OT Paid | Total Other Pay | Total Gross Paid | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3923290 | 2021 | 996.0 | NYC HOUSING AUTHORITY | MUSTACIUOLO | VITO | J | 02/26/2018 | MANHATTAN | EXECUTIVE DIRECTOR | ACTIVE | 258000.00 | PER ANNUM | 1820.0 | 257260.30 | 0.00 | 0.00 | 258000.00 | 515260.30 |
3923291 | 2021 | 996.0 | NYC HOUSING AUTHORITY | RUSS | GREGORY | P | 08/12/2019 | MANHATTAN | CHAIR | ACTIVE | 414707.00 | PER ANNUM | 1820.0 | 413518.05 | 0.00 | 0.00 | 500.00 | 414018.05 |
3923292 | 2021 | 816.0 | DEPT OF HEALTH/MENTAL HYGIENE | MCGROARTY | MICHAEL | NaN | 10/06/2014 | QUEENS | STATIONARY ENGINEER | ACTIVE | 508.80 | PER DAY | 2080.0 | 132288.00 | 2374.75 | 238829.13 | 40105.00 | 411222.13 |
3923293 | 2021 | 816.0 | DEPT OF HEALTH/MENTAL HYGIENE | HALLAHAN | PATRICK | M | 02/26/2018 | BROOKLYN | STATIONARY ENGINEER | ACTIVE | 508.80 | PER DAY | 2080.0 | 132288.00 | 2115.25 | 218628.18 | 56616.07 | 407532.25 |
3923294 | 2021 | 816.0 | DEPT OF HEALTH/MENTAL HYGIENE | PETTIT | PATRICK | J | 08/02/2010 | MANHATTAN | STATIONARY ENGINEER | ACTIVE | 508.80 | PER DAY | 2080.0 | 132288.00 | 2152.75 | 218694.96 | 38611.82 | 389594.78 |
3923295 | 2021 | 816.0 | DEPT OF HEALTH/MENTAL HYGIENE | TELEHANY | STEPHEN | M | 01/16/2007 | QUEENS | STATIONARY ENGINEER | ACTIVE | 508.80 | PER DAY | 2080.0 | 132288.00 | 1876.25 | 192296.19 | 51160.20 | 375744.39 |
3923296 | 2021 | 462.0 | GUTTMAN COMMUNITY COLLEGE | EVENBECK | SCOTT | E | 04/17/2011 | MANHATTAN | PRESIDENT | CEASED | 228000.00 | PER ANNUM | 980.0 | 122427.81 | 0.00 | 0.00 | 244495.20 | 366923.01 |
3923297 | 2021 | 56.0 | POLICE DEPARTMENT | NaN | NaN | NaN | 07/16/1984 | MANHATTAN | CAPTAIN DETAILED AS CHIEF OF TRAINING | CEASED | 241116.00 | PER ANNUM | 400.0 | 46222.47 | 0.00 | 0.00 | 312126.91 | 358349.38 |
3923298 | 2021 | 996.0 | NYC HOUSING AUTHORITY | PROCIDA | ROBERT | NaN | 04/13/1987 | BRONX | SUPERVISOR PLUMBER | ACTIVE | 387.03 | PER DAY | 1820.0 | 100627.80 | 2249.50 | 248749.72 | 7215.34 | 356592.86 |
3923299 | 2021 | 15.0 | OFFICE OF THE COMPTROLLER | DONE | ALEXIS | NaN | 03/12/2012 | MANHATTAN | PENSION INVESTMENT ADVISOR | ACTIVE | 350000.00 | PER ANNUM | 1820.0 | 349014.96 | 0.00 | 0.00 | 0.00 | 349014.96 |
Q8: What is the average, median, and standard deviation for total gross pay for all individuals in 2021?
### ENTER CODE HERE ###
total_gross_pay = nyc2['Total Gross Paid']
print(f'average,: {total_gross_pay.mean()}')
print(f'median: {total_gross_pay.median()}')
print(f'standard deviation: {total_gross_pay.std()}')
average,: 52017.993508579806 median: 43359.15 standard deviation: 50730.46191268696
Q9: How many different agencies are represented in the 2021 data? What are the names of the agencies?
### ENTER CODE HERE ###
print(nyc2['Agency Name'].nunique())
print(nyc2['Agency Name'].unique())
156 ['NYC HOUSING AUTHORITY' 'DEPT OF HEALTH/MENTAL HYGIENE' 'GUTTMAN COMMUNITY COLLEGE' 'POLICE DEPARTMENT' 'OFFICE OF THE COMPTROLLER' 'BOARD OF ELECTION' "ADMIN FOR CHILDREN'S SVCS" 'DEPARTMENT OF CORRECTION' 'DEPT OF CITYWIDE ADMIN SVCS' 'DEPT OF ENVIRONMENT PROTECTION' 'OFFICE OF THE ACTUARY' 'DEPARTMENT OF FINANCE' 'DEPARTMENT OF SANITATION' 'COMMUNITY COLLEGE (QUEENSBORO)' 'FIRE DEPARTMENT' 'DEPT OF ED PEDAGOGICAL' 'OFFICE OF THE MAYOR' 'COMMUNITY COLLEGE (BRONX)' 'COMMUNITY COLLEGE (KINGSBORO)' 'COMMUNITY COLLEGE (LAGUARDIA)' 'COMMUNITY COLLEGE (MANHATTAN)' 'DISTRICT ATTORNEY-MANHATTAN' 'CAMPAIGN FINANCE BOARD' 'COMMUNITY COLLEGE (HOSTOS)' 'DEPT OF PARKS & RECREATION' 'DEPARTMENT OF EDUCATION ADMIN' 'DEPARTMENT OF TRANSPORTATION' 'CITY COUNCIL' 'HRA/DEPT OF SOCIAL SERVICES' 'TEACHERS RETIREMENT SYSTEM' 'TAXI & LIMOUSINE COMMISSION' 'NYC EMPLOYEES RETIREMENT SYS' 'HOUSING PRESERVATION & DVLPMNT' 'DEPT. OF HOMELESS SERVICES' 'DEPARTMENT OF CITY PLANNING' 'OFFICE OF COLLECTIVE BARGAININ' 'BUSINESS INTEGRITY COMMISSION' 'LAW DEPARTMENT' 'FINANCIAL INFO SVCS AGENCY' 'OFFICE OF LABOR RELATIONS' 'DEPARTMENT OF BUILDINGS' 'DEPT OF INFO TECH & TELECOMM' 'CITY CLERK' 'DEPARTMENT FOR THE AGING' 'NYC POLICE PENSION FUND' 'OFF OF PAYROLL ADMINISTRATION' 'OFFICE OF MANAGEMENT & BUDGET' 'DEPARTMENT OF INVESTIGATION' 'BRONX DISTRICT ATTORNEY' 'HUMAN RIGHTS COMMISSION' 'DEPARTMENT OF PROBATION' 'DEPARTMENT OF BUSINESS SERV.' 'TAX COMMISSION' 'MAYORS OFFICE OF CONTRACT SVCS' 'CULTURAL AFFAIRS' 'NYC FIRE PENSION FUND' 'DEPT OF YOUTH & COMM DEV SRVS' 'ADMIN TRIALS AND HEARINGS' "NYC DEPT OF VETERANS' SERVICES" 'CIVILIAN COMPLAINT REVIEW BD' 'DISTRICT ATTORNEY KINGS COUNTY' 'INDEPENDENT BUDGET OFFICE' 'CONFLICTS OF INTEREST BOARD' 'BOROUGH PRESIDENT-BRONX' 'DEPT. OF DESIGN & CONSTRUCTION' 'LANDMARKS PRESERVATION COMM' 'DISTRICT ATTORNEY RICHMOND COU' 'DISTRICT ATTORNEY-SPECIAL NARC' 'DISTRICT ATTORNEY QNS COUNTY' 'DEPT OF RECORDS & INFO SERVICE' 'CONSUMER AFFAIRS' 'OFFICE OF EMERGENCY MANAGEMENT' 'BOARD OF CORRECTION' 'HUNTER COLLEGE HIGH SCHOOL' 'PUBLIC ADMINISTRATOR-NEW YORK' 'PUBLIC ADMINISTRATOR-BRONX' 'PUBLIC ADMINISTRATOR-KINGS' 'PUBLIC ADMINISTRATOR-QUEENS' 'PUBLIC ADMINISTRATOR-RICHMOND' 'PUBLIC ADVOCATE' 'MUNICIPAL WATER FIN AUTHORITY' 'DOE CUSTODIAL PAYROL' 'BROOKLYN COMMUNITY BOARD #18' 'PRESIDENT BOROUGH OF MANHATTAN' 'BOROUGH PRESIDENT-BROOKLYN' 'BOROUGH PRESIDENT-STATEN IS' 'CUNY CENTRAL OFFICE' 'QUEENS COMMUNITY BOARD #14' 'BRONX COMMUNITY BOARD #1' 'CIVIL SERVICE COMMISSION' 'BROOKLYN COMMUNITY BOARD #1' 'BRONX COMMUNITY BOARD #3' 'BRONX COMMUNITY BOARD #12' 'BROOKLYN COMMUNITY BOARD #10' 'QUEENS COMMUNITY BOARD #8' 'BRONX COMMUNITY BOARD #2' 'BROOKLYN COMMUNITY BOARD #12' 'BOROUGH PRESIDENT-QUEENS' 'EQUAL EMPLOY PRACTICES COMM' 'BRONX COMMUNITY BOARD #9' 'QUEENS COMMUNITY BOARD #12' 'BRONX COMMUNITY BOARD #6' 'BROOKLYN COMMUNITY BOARD #16' 'BROOKLYN COMMUNITY BOARD #8' 'BROOKLYN COMMUNITY BOARD #14' 'QUEENS COMMUNITY BOARD #6' 'QUEENS COMMUNITY BOARD #13' 'QUEENS COMMUNITY BOARD #3' 'STATEN ISLAND COMMUNITY BD #2' 'QUEENS COMMUNITY BOARD #2' 'DEPT OF ED PER DIEM TEACHERS' 'STATEN ISLAND COMMUNITY BD #3' 'QUEENS COMMUNITY BOARD #4' 'QUEENS COMMUNITY BOARD #1' 'QUEENS COMMUNITY BOARD #10' 'BROOKLYN COMMUNITY BOARD #7' 'BROOKLYN COMMUNITY BOARD #11' 'MANHATTAN COMMUNITY BOARD #11' 'DEPT OF ED PER SESSION TEACHER' 'QUEENS COMMUNITY BOARD #5' 'STATEN ISLAND COMMUNITY BD #1' 'BRONX COMMUNITY BOARD #4' 'MANHATTAN COMMUNITY BOARD #4' 'MANHATTAN COMMUNITY BOARD #9' 'MANHATTAN COMMUNITY BOARD #12' 'BROOKLYN COMMUNITY BOARD #5' 'MANHATTAN COMMUNITY BOARD #6' 'BRONX COMMUNITY BOARD #11' 'BROOKLYN COMMUNITY BOARD #13' 'QUEENS COMMUNITY BOARD #7' 'BROOKLYN COMMUNITY BOARD #3' 'MANHATTAN COMMUNITY BOARD #3' 'BROOKLYN COMMUNITY BOARD #6' 'MANHATTAN COMMUNITY BOARD #2' 'MANHATTAN COMMUNITY BOARD #1' 'BRONX COMMUNITY BOARD #8' 'MANHATTAN COMMUNITY BOARD #8' 'DEPT OF ED PARA PROFESSIONALS' 'BROOKLYN COMMUNITY BOARD #17' 'MANHATTAN COMMUNITY BOARD #10' 'QUEENS COMMUNITY BOARD #11' 'MANHATTAN COMMUNITY BOARD #7' 'BRONX COMMUNITY BOARD #10' 'BROOKLYN COMMUNITY BOARD #4' 'BROOKLYN COMMUNITY BOARD #9' 'BRONX COMMUNITY BOARD #5' 'QUEENS COMMUNITY BOARD #9' 'DEPT OF ED HRLY SUPPORT STAFF' 'MANHATTAN COMMUNITY BOARD #5' 'BRONX COMMUNITY BOARD #7' 'BROOKLYN COMMUNITY BOARD #15' 'BROOKLYN COMMUNITY BOARD #2' 'BOARD OF ELECTION POLL WORKERS' 'PERSONNEL MONITORS' 'PUBLIC SERVICE CORPS' 'DEPARTMENT OF JUVENILE JUSTICE']
Q10: What agencies had the highest and lowest median total gross pay in 2021?
### ENTER CODE HERE ###
agencies = nyc2.groupby('Agency Name')['Total Gross Paid'].median()
max_agency = agencies[agencies == agencies.max()]
#print("Max:", max_agency)
#print("Min: ",agencies[agencies == agencies.min()])
Q11: How much did Mayor Bill de Blasio make in 2021 (total gross pay)? Who had the highest total gross pay in the Mayor's office? When was Mayor Bill de Blasio's start date in the Mayor's office?
### ENTER CODE HERE ###
mayor_bill = (nyc2[nyc2['First Name'] == 'BILL'])
mayor_bill = (mayor_bill[mayor_bill['Last Name'] == 'DE BLASIO'])
mayor_bill_gross = mayor_bill['Total Gross Paid'].sum()
mayor_bill_startdate = mayor_bill['Agency Start Date'].sum()
print("Mayor Bill Start Date: ",mayor_bill_startdate)
print("Mayor Bill Gross Pay: ",mayor_bill_gross)
mayor_office = (nyc2[nyc2['Agency Name'] == 'OFFICE OF THE MAYOR'])
mayor_office = mayor_office[mayor_office['Total Gross Paid'] == mayor_office['Total Gross Paid'].max()]
print("Higest Gross Pay in Mayors office:", mayor_office['First Name'].max(), mayor_office['Last Name'].max())
Mayor Bill Start Date: 01/01/2014 Mayor Bill Gross Pay: 253064.89 Higest Gross Pay in Mayors office: DEAN FULEIHAN
Q12: How much was the total cumulative gross pay for all NYC employees in 2021? Total cumulative OT pay? What was the percentage of overtime pay as compared to total gross pay for all data in 2021?
### ENTER CODE HERE ###
total_cumulative_gross_pay = nyc2['Total Gross Paid'].sum()
total_cumulative_OT_pay = nyc2['Total OT Paid'].sum()
percentage = total_cumulative_OT_pay/total_cumulative_gross_pay*100
print(total_cumulative_gross_pay)
print(total_cumulative_OT_pay)
print(percentage)
29831122863.320004 1796557247.7300003 6.022425826749639
Q13: What was the average base salary per work location in 2021?
### ENTER CODE HERE ###
avg_salary_location = nyc2.groupby('Work Location Borough')['Base Salary'].mean().sort_values(ascending = False)
avg_salary_location
Work Location Borough WASHINGTON DC 133559.666667 OTHER 120124.578298 ALBANY 91296.344000 ULSTER 79805.588939 GREENE 72366.213333 SULLIVAN 72212.089297 SCHOHARIE 70127.428571 WESTCHESTER 68429.795154 DELAWARE 65365.190366 PUTNAM 63959.932895 DUTCHESS 60963.991379 BROOKLYN 59288.815687 QUEENS 58846.691720 RICHMOND 58241.712893 ORANGE 57403.000000 BRONX 55290.460386 MANHATTAN 43044.492124 NASSAU 29930.476500 Name: Base Salary, dtype: float64
Q14: For the work location that has the highest average base salary above, which individual in that location makes the highest base salary?
### ENTER CODE HERE ###
highest_base_salary_location = nyc2[nyc2['Work Location Borough'] == 'WASHINGTON DC']
highest_base_salary_location = highest_base_salary_location[highest_base_salary_location['Base Salary'] == highest_base_salary_location['Base Salary'].max()]
print(highest_base_salary_location['First Name'].max(),highest_base_salary_location['Last Name'].max())
REBECCA KAGAN STERNHELL
Q15: Group the 2021 data by work location and then agency name. What are the five largest median base salaries from that grouped data?
### ENTER CODE HERE ###
grouped_location_agency = nyc2.groupby(['Work Location Borough','Agency Name'])['Base Salary'].median().sort_values(ascending = False)
grouped_location_agency.head()
Work Location Borough Agency Name WASHINGTON DC DEPT OF INFO TECH & TELECOMM 177904.0 BRONX DEPT OF INFO TECH & TELECOMM 147213.0 COMMUNITY COLLEGE (LAGUARDIA) 147000.0 WASHINGTON DC LAW DEPARTMENT 142057.0 ALBANY OFFICE OF THE MAYOR 130000.0 Name: Base Salary, dtype: float64
Q16: How has the cumulative total gross pay for all individuals changed over time? Be prepared to answer just basic questions such as has it mostly increased or decreased, what year had the highest total gross pay, etc.
### ENTER CODE HERE ###
nyc['Total Gross Paid'] = nyc['Regular Gross Paid']+nyc['Total OT Paid']+nyc['Total Other Pay']
nyc.groupby(['Fiscal Year'])['Total Gross Paid'].sum()
Fiscal Year 2014 2.286248e+10 2015 2.433409e+10 2016 2.551812e+10 2017 2.714594e+10 2018 2.754932e+10 2019 2.951697e+10 2020 3.041857e+10 2021 2.983112e+10 Name: Total Gross Paid, dtype: float64
Ans: Mostly Increased and year 2020 had the highest total gross pay
Q17: How has the total number of employees for each fiscal year changed over time? Again, be prepared to answer just some basic questions.
### ENTER CODE HERE ###
nyc.groupby(['Fiscal Year'])['First Name'].count()
Fiscal Year 2014 509387 2015 577636 2016 544508 2017 562005 2018 545798 2019 592048 2020 589835 2021 568889 Name: First Name, dtype: int64
Ans: Number of employees have varied overtime but there is a overall increase in number of employees from 2014 to 2021
Q18: Who are the 2 top base salary employees per work location borough for 2021? You may want to think about creating a custom function to make this easier to answer.
### ENTER CODE HERE ###
top_base = nyc2.groupby(['Work Location Borough','First Name','Last Name'])['Base Salary'].max().sort_values(ascending = False)
top_base.groupby(level=0).head(2)
Work Location Borough First Name Last Name MANHATTAN GREGORY RUSS 414707.0 MEISHA ROSS PORTER 363346.0 QUEENS CHRISTINE MANGINO 270000.0 KENNETH ADAMS 270000.0 BRONX DAISY DE FILIPPIS 260000.0 BROOKLYN JOHN SCRIVANI 243171.0 DANIEL NIGRO 243171.0 OTHER DONALD CONYERS 241102.0 RICHMOND FRANK VEGA 240511.0 BRONX KENNETH LEHR 240511.0 RICHMOND KEVIN WOODS 235462.0 SULLIVAN PAUL RUSH 231796.0 OTHER LAURA FEIJOO 225666.0 ALBANY SIMONIA BROWN 203996.0 SULLIVAN SEAN MCANDREW 195694.0 WESTCHESTER DAVID WARNE 195295.0 ADAM REAVES 191752.0 ULSTER STEVEN SCHINDLER 188100.0 DELAWARE JOHN VICKERS 188099.0 WASHINGTON DC REBECCA KAGAN STERNHELL 178190.0 MAX SEVILLIA 177904.0 ULSTER LORI EMERY 177507.0 DUTCHESS GEORGE SCHMITT 170804.0 FRANK MILAZZO 166507.0 ALBANY ARIANA CAPLAN 161136.0 SCHOHARIE BRIAN HANDY 150393.0 GREENE LARRY ARNOLD 137534.0 DELAWARE ROBERT FLYNN III 134280.0 PUTNAM JAMES KEESLER 126624.0 SCHOHARIE EMORY CHASE 112561.0 PUTNAM PATRICK FRAWLEY 91648.0 GREENE JOSHUA JONES 91551.0 NASSAU LUANNE HORNE 81186.0 LORETTA CONROY 81186.0 ORANGE ESTER CROSS 57403.0 Name: Base Salary, dtype: float64
Q19: Who are the 2 top total gross paid employees per borough for 2021? If you made a custom function for the previous question, this will be much easier to answer.
### ENTER CODE HERE ###
top_gross = nyc2.groupby(['Work Location Borough','First Name','Last Name'])['Total Gross Paid'].max().sort_values(ascending = False)
top_gross.groupby(level=0).head(2)
Work Location Borough First Name Last Name MANHATTAN VITO MUSTACIUOLO 515260.30 GREGORY RUSS 414018.05 QUEENS MICHAEL MCGROARTY 411222.13 BROOKLYN PATRICK HALLAHAN 407532.25 QUEENS STEPHEN TELEHANY 375744.39 BRONX ROBERT PROCIDA 356592.86 GARFIELD DALEY 347824.78 BROOKLYN CHRISTOPHER REINHOLD 347654.07 RICHMOND JOSEPH ALBANESE 276537.47 WILLIAM HORAN 264396.65 OTHER DANIEL SCANLON 263786.62 JOAN INDART ETIENNE 247171.40 SULLIVAN PAUL RUSH 226703.15 WESTCHESTER DANIEL MASSI 200973.04 RICHARD VENTURA 199641.63 SULLIVAN SEAN MCANDREW 191479.22 DELAWARE JOHN VICKERS 184966.23 DUTCHESS CHARLES NEWMAN 182907.46 ALBANY SIMONIA BROWN 179952.18 WASHINGTON DC REBECCA KAGAN STERNHELL 174429.45 ULSTER JOSEPH ZIBELLA 169345.53 TODD WEST 168101.51 DUTCHESS WESTON VANVORST 165782.71 SCHOHARIE BRIAN HANDY 148088.64 GREENE ERNEST FANKHAUSER 143733.71 ALBANY MARK FAHD 140780.85 WASHINGTON DC KIMBERLY CONWAY 139480.01 DELAWARE ROBERT BIANCHI 137612.58 PUTNAM WILLIAM CRUGER 133032.34 JAMES KEESLER 126810.39 SCHOHARIE EMORY CHASE 125288.56 GREENE JOSHUA JONES 114065.93 NASSAU SUSAN DEELY 102972.26 LORETTA CONROY 98429.10 ORANGE ESTER CROSS 63313.06 Name: Total Gross Paid, dtype: float64
Q20: For 2021, what percentage of individuals earned income per annum, per day and per hour?
total = nyc2['Pay Basis'].count()
per_annum = nyc2[nyc2['Pay Basis'] == 'PER ANNUM']
per_annum = per_annum['Pay Basis'].count()
per_day = nyc2[nyc2['Pay Basis'] == 'PER DAY']
per_day = per_day['Pay Basis'].count()
per_hour = nyc2[nyc2['Pay Basis'] == 'PER HOUR']
per_hour = per_hour['Pay Basis'].count()
percentage_annum = per_annum/total*100
percentage_day = per_day/total*100
percentage_hour = per_hour/total*100
print('Percentages',percentage_annum,percentage_day,percentage_hour)
Percentages 59.96718264202401 24.03409378231385 15.675606868278939
Q21: What is the average total gross pay broken down by the categories in the Pay Basis
column for 2021?
### ENTER CODE HERE ###
avg_gross_pay_basis = nyc2.groupby('Pay Basis')['Total Gross Paid'].mean()
avg_gross_pay_basis
Pay Basis PER ANNUM 78625.133331 PER DAY 12462.578868 PER HOUR 11223.010701 PRORATED ANNUAL 35334.994949 Name: Total Gross Paid, dtype: float64
Q22: Create a pivot table using the median of total gross pay with the index by agency name and columns broken down by pay basis category. Include margins=True
. Be prepared to answer questions based on this pivot table.
### ENTER CODE HERE ###
output = pd.pivot_table(data=nyc,
index=['Agency Name'],
columns=['Pay Basis'],
values='Total Gross Paid',
aggfunc='median',
margins=True)
output
Pay Basis | PER ANNUM | PER DAY | PER HOUR | PRORATED ANNUAL | All |
---|---|---|---|---|---|
Agency Name | |||||
ADMIN FOR CHILDREN'S SVCS | 63820.190 | 115830.580 | 5492.295 | NaN | 63663.890 |
ADMIN TRIALS AND HEARINGS | 55490.500 | 65795.105 | 28201.850 | NaN | 37206.710 |
BOARD OF CORRECTION | 66263.995 | NaN | NaN | NaN | 66263.995 |
BOARD OF CORRECTIONS | 60042.470 | NaN | NaN | NaN | 60042.470 |
BOARD OF ELECTION | 55180.300 | NaN | 14585.030 | NaN | 37035.145 |
... | ... | ... | ... | ... | ... |
STATEN ISLAND COMMUNITY BD #3 | 64494.630 | NaN | NaN | NaN | 64494.630 |
TAX COMMISSION | 100457.425 | NaN | 13157.480 | 25604.700 | 78020.330 |
TAXI & LIMOUSINE COMMISSION | 48566.750 | 65981.310 | 30615.490 | NaN | 46040.220 |
TEACHERS RETIREMENT SYSTEM | 70234.065 | NaN | 3257.500 | NaN | 66221.770 |
All | 69194.880 | 2807.200 | 1553.000 | 30810.245 | 39110.450 |
165 rows × 5 columns
Q23: What positions (grouped by titles) pay the most based on the median total gross pay in 2021? Show the top 10.
### ENTER CODE HERE ###
nyc2.groupby('Title Description')['Total Gross Paid'].median().sort_values(ascending=False).head(10)
Title Description PENSION INVESTMENT ADVISOR 349014.960 CHAIR 326110.630 CHIEF ACTUARY 305032.320 CAPTAIN DETAILED AS CHIEF OF TRAINING 297703.365 FIRST DEPUTY MAYOR 284742.280 PRESIDENT 275457.880 DIRECTOR OF INVESTMENTS 264254.050 CAPTAIN DETAILED AS CHIEF OF RISK MANAGEMENT 254051.460 MAYOR 253064.890 CHANCELLOR 251872.840 Name: Total Gross Paid, dtype: float64
Q24: Create a set of boxplots that show the total gross pay for all the employees of each of the various work location boroughs. In other words, your x axis will be the work locations and your y axis will be the total gross pay for the respective work location.
### ENTER CODE HERE ###
nyc.boxplot(by ='Work Location Borough', column =['Total Gross Paid'], grid = False, figsize=(20, 10))
<AxesSubplot:title={'center':'Total Gross Paid'}, xlabel='Work Location Borough'>
Q25: What are the top 10 agencies that have the most total number of overtime hours for 2021? What are the top 10 agencies that have the highest average number of overtime hours?
### ENTER CODE HERE ###
nyc2.groupby('Agency Name')['OT Hours'].sum().sort_values(ascending=False).head(10)
Agency Name POLICE DEPARTMENT 10815884.93 FIRE DEPARTMENT 8220712.59 DEPARTMENT OF SANITATION 4206474.56 DEPARTMENT OF CORRECTION 3323294.05 NYC HOUSING AUTHORITY 2686245.50 HRA/DEPT OF SOCIAL SERVICES 1276660.51 DEPARTMENT OF TRANSPORTATION 1074099.24 DEPT OF HEALTH/MENTAL HYGIENE 867652.73 DEPT OF ENVIRONMENT PROTECTION 679410.40 ADMIN FOR CHILDREN'S SVCS 539092.21 Name: OT Hours, dtype: float64
nyc2.groupby('Agency Name')['OT Hours'].mean().sort_values(ascending=False).head(10)
Agency Name BOARD OF ELECTION 454.591618 FIRE DEPARTMENT 431.556123 DEPARTMENT OF SANITATION 347.556355 DEPARTMENT OF CORRECTION 265.184651 DEPT. OF HOMELESS SERVICES 195.478552 NYC HOUSING AUTHORITY 186.973307 POLICE DEPARTMENT 183.292124 DEPT OF CITYWIDE ADMIN SVCS 176.666722 DEPARTMENT OF TRANSPORTATION 172.379913 DEPT OF HEALTH/MENTAL HYGIENE 111.984090 Name: OT Hours, dtype: float64
Q26: Attempt to do this in one line of code by using agg
. For the 2021 data, groupby the agency name and get the mean for the base salary, the median for the total overtime paid, and the standard deviation for the total gross paid. Save this as an object and be prepared to index it.
nyc2.groupby('Agency Name').agg({'Base Salary': 'mean', 'Total OT Paid': 'median', 'Total Gross Paid' : 'std'})
Base Salary | Total OT Paid | Total Gross Paid | |
---|---|---|---|
Agency Name | |||
ADMIN FOR CHILDREN'S SVCS | 68928.295008 | 21.95 | 32124.442208 |
ADMIN TRIALS AND HEARINGS | 42774.316154 | 0.00 | 38677.473864 |
BOARD OF CORRECTION | 94507.127586 | 0.00 | 41924.536647 |
BOARD OF ELECTION | 41241.893459 | 13800.16 | 40142.716389 |
BOARD OF ELECTION POLL WORKERS | 1.009790 | 0.00 | 2131.199265 |
... | ... | ... | ... |
STATEN ISLAND COMMUNITY BD #2 | 54442.350000 | 0.00 | 60292.117307 |
STATEN ISLAND COMMUNITY BD #3 | 76387.000000 | 0.00 | 23567.010355 |
TAX COMMISSION | 88848.096129 | 0.00 | 51740.416321 |
TAXI & LIMOUSINE COMMISSION | 59062.593797 | 0.08 | 30097.186297 |
TEACHERS RETIREMENT SYSTEM | 80132.776992 | 0.00 | 48630.019747 |
156 rows × 3 columns
Q27: What is the average base salary for anyone with DATABASE
in their title description for 2021?
avg_sal_database = nyc2[nyc2['Title Description'].str.contains("DATABASE", na = False)]
avg_sal_database['Base Salary'].mean()
121565.0
Q28: Select only the 2021 individuals that get paid on a per Annum
basis. Create a Series so that for each Agency, you calculate the range of the highest base salary for that agency minus the lowest base salary for that agency. Be prepared to index into this Series to answer questions.
Hint: I created a custom function similar to what we did in the "Data Aggregation and Grouping using Flags Data" module example.
### ENTER CODE HERE ###
per_annum_employees = nyc2[nyc2['Pay Basis'] == 'PER ANNUM']
output = per_annum_employees.groupby(['Agency Name','First Name','Last Name'])['Base Salary'].max()
output.groupby(level=0).max() - output.groupby(level=0).min()
Agency Name ADMIN FOR CHILDREN'S SVCS 210651.0 ADMIN TRIALS AND HEARINGS 198426.0 BOARD OF CORRECTION 149900.0 BOARD OF ELECTION 204388.0 BOROUGH PRESIDENT-BRONX 178898.0 ... STATEN ISLAND COMMUNITY BD #2 0.0 STATEN ISLAND COMMUNITY BD #3 44770.0 TAX COMMISSION 164966.0 TAXI & LIMOUSINE COMMISSION 211598.0 TEACHERS RETIREMENT SYSTEM 199491.0 Name: Base Salary, Length: 148, dtype: float64
Q29: For 2021, determine what work location and agency combination (group by work location, then by agency) has an average base salary below $100,000
but above $90,000
. How many fall into this category?
Be prepared on your exam to answer about other combinations such as above $50,000 and the agency starts with the letter D
. I am intentionally not telling you everything that will be asked so that you think of a way to be able to pull this information quickly instead of doing it in a much more manual way. Make sure that you review string methods as well.
### ENTER CODE HERE ###
nyc2.groupby(['Work Location Borough','Agency Name']).apply(lambda x: x[(x['Base Salary'] < 100000) & (x['Base Salary'] > 90000)]['Base Salary'].mean()).count()
188